Exploring Property Assessment and Sales Data for Informed Decision-Making.
#import all the necessary libraries
import numpy as np
import pandas as pd
#for visuals
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
plt.style.use('ggplot')
# import and read files ===> #pd.read_excel() #pd.read_json() #pd.read_tsv()
df = pd.read_csv(r"C:\Users\User1\Downloads\Real_Estate_Sales_2001-2020_GL.csv")
df
C:\Users\User1\AppData\Local\Temp\ipykernel_7100\3880391463.py:2: DtypeWarning: Columns (8,9,10,11,12) have mixed types. Specify dtype option on import or set low_memory=False. df = pd.read_csv(r"C:\Users\User1\Downloads\Real_Estate_Sales_2001-2020_GL.csv")
| Serial Number | List Year | Date Recorded | Town | Address | Assessed Value | Sale Amount | Sales Ratio | Property Type | Residential Type | Non Use Code | Assessor Remarks | OPM remarks | Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020348 | 2020 | 09/13/2021 | Ansonia | 230 WAKELEE AVE | 150500.0 | 325000.0 | 0.463000 | Commercial | NaN | NaN | NaN | NaN | NaN |
| 1 | 20002 | 2020 | 10/02/2020 | Ashford | 390 TURNPIKE RD | 253000.0 | 430000.0 | 0.588300 | Residential | Single Family | NaN | NaN | NaN | NaN |
| 2 | 200212 | 2020 | 03/09/2021 | Avon | 5 CHESTNUT DRIVE | 130400.0 | 179900.0 | 0.724800 | Residential | Condo | NaN | NaN | NaN | NaN |
| 3 | 200243 | 2020 | 04/13/2021 | Avon | 111 NORTHINGTON DRIVE | 619290.0 | 890000.0 | 0.695800 | Residential | Single Family | NaN | NaN | NaN | NaN |
| 4 | 200377 | 2020 | 07/02/2021 | Avon | 70 FAR HILLS DRIVE | 862330.0 | 1447500.0 | 0.595700 | Residential | Single Family | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 997208 | 190272 | 2019 | 06/24/2020 | New London | 4 BISHOP CT | 60410.0 | 53100.0 | 1.137665 | Single Family | Single Family | 14 - Foreclosure | NaN | NaN | NaN |
| 997209 | 190284 | 2019 | 11/27/2019 | Waterbury | 126 PERKINS AVE | 68280.0 | 76000.0 | 0.898400 | Single Family | Single Family | 25 - Other | PRIVATE SALE | NaN | NaN |
| 997210 | 190129 | 2019 | 04/27/2020 | Windsor Locks | 19 HATHAWAY ST | 121450.0 | 210000.0 | 0.578300 | Single Family | Single Family | NaN | NaN | NaN | NaN |
| 997211 | 190504 | 2019 | 06/03/2020 | Middletown | 8 BYSTREK DR | 203360.0 | 280000.0 | 0.726300 | Single Family | Single Family | NaN | NaN | NaN | NaN |
| 997212 | 190344 | 2019 | 12/20/2019 | Milford | 250 RESEARCH DR | 4035970.0 | 7450000.0 | 0.541700 | NaN | NaN | NaN | NaN | NaN | NaN |
997213 rows × 14 columns
#shape of the data
df.shape
(997213, 14)
#the info of the data
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 997213 entries, 0 to 997212 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Serial Number 997213 non-null int64 1 List Year 997213 non-null int64 2 Date Recorded 997211 non-null object 3 Town 997213 non-null object 4 Address 997162 non-null object 5 Assessed Value 997213 non-null float64 6 Sale Amount 997213 non-null float64 7 Sales Ratio 997213 non-null float64 8 Property Type 614767 non-null object 9 Residential Type 608904 non-null object 10 Non Use Code 289681 non-null object 11 Assessor Remarks 149864 non-null object 12 OPM remarks 9934 non-null object 13 Location 197697 non-null object dtypes: float64(3), int64(2), object(9) memory usage: 106.5+ MB
# .dtypes ===> for the data types of the columns
df.dtypes
Serial Number int64 List Year int64 Date Recorded object Town object Address object Assessed Value float64 Sale Amount float64 Sales Ratio float64 Property Type object Residential Type object Non Use Code object Assessor Remarks object OPM remarks object Location object dtype: object
# .columns
df.columns.to_list()
['Serial Number', 'List Year', 'Date Recorded', 'Town', 'Address', 'Assessed Value', 'Sale Amount', 'Sales Ratio', 'Property Type', 'Residential Type', 'Non Use Code', 'Assessor Remarks', 'OPM remarks', 'Location']
# check for missing values # .isnull() ===> they return boolean mask.
df.isnull()
| Serial Number | List Year | Date Recorded | Town | Address | Assessed Value | Sale Amount | Sales Ratio | Property Type | Residential Type | Non Use Code | Assessor Remarks | OPM remarks | Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | True | True | True | True | True |
| 1 | False | False | False | False | False | False | False | False | False | False | True | True | True | True |
| 2 | False | False | False | False | False | False | False | False | False | False | True | True | True | True |
| 3 | False | False | False | False | False | False | False | False | False | False | True | True | True | True |
| 4 | False | False | False | False | False | False | False | False | False | False | True | True | True | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 997208 | False | False | False | False | False | False | False | False | False | False | False | True | True | True |
| 997209 | False | False | False | False | False | False | False | False | False | False | False | False | True | True |
| 997210 | False | False | False | False | False | False | False | False | False | False | True | True | True | True |
| 997211 | False | False | False | False | False | False | False | False | False | False | True | True | True | True |
| 997212 | False | False | False | False | False | False | False | False | True | True | True | True | True | True |
997213 rows × 14 columns
# sum() with .isnull()
df.isnull().sum()
Serial Number 0 List Year 0 Date Recorded 2 Town 0 Address 51 Assessed Value 0 Sale Amount 0 Sales Ratio 0 Property Type 382446 Residential Type 388309 Non Use Code 707532 Assessor Remarks 847349 OPM remarks 987279 Location 799516 dtype: int64
# Create a heatmap of missing values
plt.figure(figsize=(10, 6))
sns.heatmap(df.isnull(), cmap='viridis', cbar=True)
plt.title('Missing Values Heatmap')
plt.show()
# Change the data type of column 'A' to complex
df = df.rename(columns={'Location': 'Geographic Area'})
print(df)
Serial Number List Year Date Recorded Town \
0 2020348 2020 09/13/2021 Ansonia
1 20002 2020 10/02/2020 Ashford
2 200212 2020 03/09/2021 Avon
3 200243 2020 04/13/2021 Avon
4 200377 2020 07/02/2021 Avon
... ... ... ... ...
997208 190272 2019 06/24/2020 New London
997209 190284 2019 11/27/2019 Waterbury
997210 190129 2019 04/27/2020 Windsor Locks
997211 190504 2019 06/03/2020 Middletown
997212 190344 2019 12/20/2019 Milford
Address Assessed Value Sale Amount Sales Ratio \
0 230 WAKELEE AVE 150500.0 325000.0 0.463000
1 390 TURNPIKE RD 253000.0 430000.0 0.588300
2 5 CHESTNUT DRIVE 130400.0 179900.0 0.724800
3 111 NORTHINGTON DRIVE 619290.0 890000.0 0.695800
4 70 FAR HILLS DRIVE 862330.0 1447500.0 0.595700
... ... ... ... ...
997208 4 BISHOP CT 60410.0 53100.0 1.137665
997209 126 PERKINS AVE 68280.0 76000.0 0.898400
997210 19 HATHAWAY ST 121450.0 210000.0 0.578300
997211 8 BYSTREK DR 203360.0 280000.0 0.726300
997212 250 RESEARCH DR 4035970.0 7450000.0 0.541700
Property Type Residential Type Non Use Code Assessor Remarks \
0 Commercial NaN NaN NaN
1 Residential Single Family NaN NaN
2 Residential Condo NaN NaN
3 Residential Single Family NaN NaN
4 Residential Single Family NaN NaN
... ... ... ... ...
997208 Single Family Single Family 14 - Foreclosure NaN
997209 Single Family Single Family 25 - Other PRIVATE SALE
997210 Single Family Single Family NaN NaN
997211 Single Family Single Family NaN NaN
997212 NaN NaN NaN NaN
OPM remarks Geographic Area
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 NaN NaN
4 NaN NaN
... ... ...
997208 NaN NaN
997209 NaN NaN
997210 NaN NaN
997211 NaN NaN
997212 NaN NaN
[997213 rows x 14 columns]
# check the columns that are categorical # .select_dtypes()
cat_cols = df.select_dtypes(include = ['category', 'object']).columns.to_list()
cat_cols
['Date Recorded', 'Town', 'Address', 'Property Type', 'Residential Type', 'Non Use Code', 'Assessor Remarks', 'OPM remarks', 'Geographic Area']
# check the columns that are numerical
num_cols = df.select_dtypes(include = ['float', 'int64']).columns.to_list()
num_cols
['Serial Number', 'List Year', 'Assessed Value', 'Sale Amount', 'Sales Ratio']
# value counts for categorical columns
for columns in cat_cols:
print(df[columns].value_counts())
Date Recorded
07/01/2005 877
08/01/2005 859
07/01/2004 840
06/30/2005 828
09/30/2005 781
...
01/21/2006 1
07/07/2012 1
04/14/2012 1
07/28/2012 1
03/07/2020 1
Name: count, Length: 6387, dtype: int64
Town
Bridgeport 34201
Stamford 32529
Waterbury 28506
Norwalk 23960
New Haven 21346
...
Hartland 448
Scotland 430
Canaan 429
Union 261
***Unknown*** 1
Name: count, Length: 170, dtype: int64
Address
MULTI ADDRESSES 620
8 SOUTH RD 420
51 OLD SPRINGFIELD RD 172
312 N BISHOP AVE 114
RIGGS ST 106
...
655 120 TALCOTTVILLE RD 1
414 SALMON BRK ST 1
21 BUTTERNUT KNL 1
59 BURNSIDE AVE UT 2 & 1
126 PERKINS AVE 1
Name: count, Length: 714371, dtype: int64
Property Type
Single Family 401612
Condo 105420
Residential 60728
Two Family 26408
Three Family 12586
Vacant Land 3163
Four Family 2150
Commercial 1981
Apartments 486
Industrial 228
Public Utility 5
Name: count, dtype: int64
Residential Type
Single Family 445016
Condo 117780
Two Family 29609
Three Family 14081
Four Family 2418
Name: count, dtype: int64
Non Use Code
25 - Other 59916
14 - Foreclosure 52445
07 - Change in Property 34248
08 - Part Interest 15496
7.0 14381
...
38.0 1
48.0 1
68.0 1
33 1
75 1
Name: count, Length: 106, dtype: int64
Assessor Remarks
ESTATE SALE 5044
BELOW MARKET 2568
SHORT SALE 2510
NEW CONSTRUCTION 2048
FORECLOSURE 1847
...
LACK OF PARKING 1
estate sale;per verification house substan worse than as of assmt date 1
SOLD W/ GARAGE UNIT G-1 1
sold less than market value 1
MOTIVATED SELLER -DIVORCE 1
Name: count, Length: 66508, dtype: int64
OPM remarks
GOOD SALE PER MLS 978
NO MLS 500
NEW CONSTRUCTION 345
SHORT SALE PER MLS 326
TOTAL RENOVATION PER MLS 316
...
REMODELED PER MLS - SEE PREVIOUS SALE #170076 1
REO SALE - CASH ONLY SOLD AS IS 1
TWO SALES - ALSO SEE #1700209 1
NOT A VALID SALE PER TOWN SITE AND PER MLS SALE PRICE = $345,000 1
PER MLS CLOSING PRICE = $1,145,000 1
Name: count, Length: 4825, dtype: int64
Geographic Area
POINT (-72.36336 41.97461) 181
POINT (-73.41854 41.13449) 145
POINT (-73.45225 41.096) 87
POINT (-73.06359 41.52255) 77
POINT (-72.96095 41.54989) 73
...
POINT (-71.90591 41.34189) 1
POINT (-72.89441 41.8245) 1
POINT (-72.94279 41.69454) 1
POINT (-72.52547 41.26596) 1
POINT (-72.07006 41.53315) 1
Name: count, Length: 130529, dtype: int64
#value counts for numerical columns
for column in num_cols:
print(df[columns].value_counts())
Geographic Area
POINT (-72.36336 41.97461) 181
POINT (-73.41854 41.13449) 145
POINT (-73.45225 41.096) 87
POINT (-73.06359 41.52255) 77
POINT (-72.96095 41.54989) 73
...
POINT (-71.90591 41.34189) 1
POINT (-72.89441 41.8245) 1
POINT (-72.94279 41.69454) 1
POINT (-72.52547 41.26596) 1
POINT (-72.07006 41.53315) 1
Name: count, Length: 130529, dtype: int64
Geographic Area
POINT (-72.36336 41.97461) 181
POINT (-73.41854 41.13449) 145
POINT (-73.45225 41.096) 87
POINT (-73.06359 41.52255) 77
POINT (-72.96095 41.54989) 73
...
POINT (-71.90591 41.34189) 1
POINT (-72.89441 41.8245) 1
POINT (-72.94279 41.69454) 1
POINT (-72.52547 41.26596) 1
POINT (-72.07006 41.53315) 1
Name: count, Length: 130529, dtype: int64
Geographic Area
POINT (-72.36336 41.97461) 181
POINT (-73.41854 41.13449) 145
POINT (-73.45225 41.096) 87
POINT (-73.06359 41.52255) 77
POINT (-72.96095 41.54989) 73
...
POINT (-71.90591 41.34189) 1
POINT (-72.89441 41.8245) 1
POINT (-72.94279 41.69454) 1
POINT (-72.52547 41.26596) 1
POINT (-72.07006 41.53315) 1
Name: count, Length: 130529, dtype: int64
Geographic Area
POINT (-72.36336 41.97461) 181
POINT (-73.41854 41.13449) 145
POINT (-73.45225 41.096) 87
POINT (-73.06359 41.52255) 77
POINT (-72.96095 41.54989) 73
...
POINT (-71.90591 41.34189) 1
POINT (-72.89441 41.8245) 1
POINT (-72.94279 41.69454) 1
POINT (-72.52547 41.26596) 1
POINT (-72.07006 41.53315) 1
Name: count, Length: 130529, dtype: int64
Geographic Area
POINT (-72.36336 41.97461) 181
POINT (-73.41854 41.13449) 145
POINT (-73.45225 41.096) 87
POINT (-73.06359 41.52255) 77
POINT (-72.96095 41.54989) 73
...
POINT (-71.90591 41.34189) 1
POINT (-72.89441 41.8245) 1
POINT (-72.94279 41.69454) 1
POINT (-72.52547 41.26596) 1
POINT (-72.07006 41.53315) 1
Name: count, Length: 130529, dtype: int64
# Convert 'OPM remarks' to string data type
df['OPM remarks'] = df['OPM remarks'].astype(str)
print(df.dtypes)
Serial Number int64 List Year int64 Date Recorded object Town object Address object Assessed Value float64 Sale Amount float64 Sales Ratio float64 Property Type object Residential Type object Non Use Code object Assessor Remarks object OPM remarks object Geographic Area object dtype: object
#check the head of the column
df.head()
| Serial Number | List Year | Date Recorded | Town | Address | Assessed Value | Sale Amount | Sales Ratio | Property Type | Residential Type | Non Use Code | Assessor Remarks | OPM remarks | Geographic Area | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020348 | 2020 | 09/13/2021 | Ansonia | 230 WAKELEE AVE | 150500.0 | 325000.0 | 0.4630 | Commercial | NaN | NaN | NaN | nan | NaN |
| 1 | 20002 | 2020 | 10/02/2020 | Ashford | 390 TURNPIKE RD | 253000.0 | 430000.0 | 0.5883 | Residential | Single Family | NaN | NaN | nan | NaN |
| 2 | 200212 | 2020 | 03/09/2021 | Avon | 5 CHESTNUT DRIVE | 130400.0 | 179900.0 | 0.7248 | Residential | Condo | NaN | NaN | nan | NaN |
| 3 | 200243 | 2020 | 04/13/2021 | Avon | 111 NORTHINGTON DRIVE | 619290.0 | 890000.0 | 0.6958 | Residential | Single Family | NaN | NaN | nan | NaN |
| 4 | 200377 | 2020 | 07/02/2021 | Avon | 70 FAR HILLS DRIVE | 862330.0 | 1447500.0 | 0.5957 | Residential | Single Family | NaN | NaN | nan | NaN |
# fill the missing values with its mode
df['Address'].fillna(df['Address'].mode()[0], inplace = True)
df['Property Type'].fillna(df['Property Type'].mode()[0], inplace = True)
df['Residential Type'].fillna(df['Residential Type'].mode()[0], inplace = True)
df['Non Use Code'].fillna(df['Non Use Code'].mode()[0], inplace = True)
df['Assessor Remarks'].fillna(df['Assessor Remarks'].mode()[0], inplace = True)
df['Geographic Area'].fillna(df['Geographic Area'].mode()[0], inplace = True)
df['Date Recorded'].fillna(df['Date Recorded'].mode()[0], inplace = True)
df['OPM remarks'].fillna(df['OPM remarks'].mode()[0], inplace = True)
# convert the 'Date Recorded' to Datetime
df['Date Recorded'] = pd.to_datetime(df['Date Recorded'])
# check the head of the column
df.head()
| Serial Number | List Year | Date Recorded | Town | Address | Assessed Value | Sale Amount | Sales Ratio | Property Type | Residential Type | Non Use Code | Assessor Remarks | OPM remarks | Geographic Area | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020348 | 2020 | 2021-09-13 | Ansonia | 230 WAKELEE AVE | 150500.0 | 325000.0 | 0.4630 | Commercial | Single Family | 25 - Other | ESTATE SALE | nan | POINT (-72.36336 41.97461) |
| 1 | 20002 | 2020 | 2020-10-02 | Ashford | 390 TURNPIKE RD | 253000.0 | 430000.0 | 0.5883 | Residential | Single Family | 25 - Other | ESTATE SALE | nan | POINT (-72.36336 41.97461) |
| 2 | 200212 | 2020 | 2021-03-09 | Avon | 5 CHESTNUT DRIVE | 130400.0 | 179900.0 | 0.7248 | Residential | Condo | 25 - Other | ESTATE SALE | nan | POINT (-72.36336 41.97461) |
| 3 | 200243 | 2020 | 2021-04-13 | Avon | 111 NORTHINGTON DRIVE | 619290.0 | 890000.0 | 0.6958 | Residential | Single Family | 25 - Other | ESTATE SALE | nan | POINT (-72.36336 41.97461) |
| 4 | 200377 | 2020 | 2021-07-02 | Avon | 70 FAR HILLS DRIVE | 862330.0 | 1447500.0 | 0.5957 | Residential | Single Family | 25 - Other | ESTATE SALE | nan | POINT (-72.36336 41.97461) |
# statistical summary of the data
df.describe()
| Serial Number | List Year | Date Recorded | Assessed Value | Sale Amount | Sales Ratio | |
|---|---|---|---|---|---|---|
| count | 9.972130e+05 | 997213.000000 | 997213 | 9.972130e+05 | 9.972130e+05 | 9.972130e+05 |
| mean | 4.311864e+05 | 2010.189829 | 2011-06-19 11:39:01.489330432 | 2.791437e+05 | 3.911512e+05 | 1.044637e+01 |
| min | 0.000000e+00 | 2001.000000 | 1999-04-05 00:00:00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 25% | 3.044400e+04 | 2004.000000 | 2005-07-25 00:00:00 | 8.760000e+04 | 1.400000e+05 | 4.867000e-01 |
| 50% | 7.030300e+04 | 2010.000000 | 2011-01-19 00:00:00 | 1.383900e+05 | 2.250000e+05 | 6.246000e-01 |
| 75% | 1.518780e+05 | 2016.000000 | 2017-04-05 00:00:00 | 2.255600e+05 | 3.650000e+05 | 7.852761e-01 |
| max | 2.000500e+09 | 2020.000000 | 2021-09-30 00:00:00 | 8.815100e+08 | 5.000000e+09 | 1.226420e+06 |
| std | 6.549219e+06 | 6.237877 | NaN | 1.670610e+06 | 5.347270e+06 | 1.890192e+03 |
# distribution of numerical column
columns_to_visualize = ['Assessed Value', 'Sale Amount', 'Sales Ratio']
# Create a figure and axes object
fig, axs = plt.subplots(1, 3, figsize=(12, 4))
# Plot histograms for each numerical column
axs[0].hist(df['Assessed Value'], bins=20, color='blue')
axs[0].set_title('Assessed Value')
axs[0].set_xlabel('Value')
axs[0].set_ylabel('Frequency')
axs[1].hist(df['Sale Amount'], bins=20, color='green')
axs[1].set_title('Sale Amount')
axs[1].set_xlabel('Value')
axs[1].set_ylabel('Frequency')
axs[2].hist(df['Sales Ratio'], bins=20, color='orange')
axs[2].set_title('Sales Ratio')
axs[2].set_xlabel('Value')
axs[2].set_ylabel('Frequency')
# Adjust spacing between subplots if needed
plt.tight_layout()
# Display the chart
plt.show()
# the number of unique property type
print(f"The number of unique Property type is {df['Property Type'].nunique()} \nThey are as follow: \n{df['Property Type'].unique()}")
The number of unique Property type is 11 They are as follow: ['Commercial' 'Residential' 'Vacant Land' 'Single Family' 'Apartments' 'Industrial' 'Public Utility' 'Condo' 'Two Family' 'Three Family' 'Four Family']
# the number of unique residential type
print(f"The number of unique Residential type is {df['Residential Type'].nunique()} \nThey are as follow: \n{df['Residential Type'].unique()}")
The number of unique Residential type is 5 They are as follow: ['Single Family' 'Condo' 'Two Family' 'Three Family' 'Four Family']
# the number of unique town
print(f"The number of unique Town is {df['Town'].nunique()} \nThey are as follow: \n{df['Town'].unique()}")
The number of unique Town is 170 They are as follow: ['Ansonia' 'Ashford' 'Avon' 'Berlin' 'Bethany' 'Bethel' 'Bethlehem' 'Bloomfield' 'Branford' 'Bristol' 'Brookfield' 'Canaan' 'Canton' 'Cheshire' 'Chester' 'Colchester' 'Columbia' 'Cornwall' 'Coventry' 'Cromwell' 'Danbury' 'Derby' 'Eastford' 'East Haddam' 'East Haven' 'Farmington' 'Chaplin' 'Clinton' 'East Lyme' 'Easton' 'Enfield' 'Essex' 'Durham' 'Franklin' 'Glastonbury' 'Hamden' 'Granby' 'Greenwich' 'Colebrook' 'East Windsor' 'Griswold' 'Bolton' 'Groton' 'Guilford' 'Hartford' 'Harwinton' 'Milford' 'Killingly' 'Killingworth' 'Lebanon' 'Lisbon' 'Litchfield' 'Lyme' 'Manchester' 'Putnam' 'Norwalk' 'Stafford' 'Mansfield' 'Sherman' 'Meriden' 'Stratford' 'Roxbury' 'Oxford' 'Old Lyme' 'Norwich' 'Sharon' 'Monroe' 'Tolland' 'Torrington' 'Naugatuck' 'Ridgefield' 'New London' 'New Britain' 'Orange' 'New Canaan' 'New Fairfield' 'New Hartford' 'New Haven' 'Somers' 'Newtown' 'North Haven' 'West Haven' 'Morris' 'Thompson' 'Stonington' 'Stamford' 'Newington' 'Plainfield' 'Vernon' 'Plainville' 'Watertown' 'West Hartford' 'Plymouth' 'Portland' 'Redding' 'Warren' 'Rocky Hill' 'Salem' 'Winchester' 'Shelton' 'Simsbury' 'Windsor' 'Southbury' 'South Windsor' 'Brooklyn' 'Sterling' 'Ellington' 'Suffield' 'Thomaston' 'East Hartford' 'Trumbull' 'Fairfield' 'Ledyard' 'Washington' 'Waterbury' 'Hebron' 'Barkhamsted' 'Waterford' 'Westbrook' 'Wethersfield' 'Darien' 'Willington' 'Wilton' 'Windsor Locks' 'Wolcott' 'Woodbury' 'Woodstock' 'East Granby' 'Burlington' 'Bozrah' 'Goshen' 'Madison' 'Bridgeport' 'Bridgewater' 'Kent' 'Beacon Falls' 'Andover' 'Hampton' 'Montville' 'Prospect' 'Deep River' 'Southington' 'Norfolk' 'Westport' 'Windham' 'Wallingford' 'Weston' 'Voluntown' 'Middletown' 'Middlefield' 'Middlebury' 'Old Saybrook' 'North Canaan' 'Preston' 'Scotland' 'Sprague' 'Pomfret' 'Seymour' 'Woodbridge' 'Union' 'Haddam' 'Canterbury' 'Marlborough' 'New Milford' 'North Stonington' 'East Hampton' 'Hartland' 'Salisbury' 'North Branford' '***Unknown***']
# top 5 customers with the most purchases
top5_customer = df['Serial Number'].value_counts().head()
top5_customer
Serial Number 10010 172 10018 172 10002 172 10003 171 10009 171 Name: count, dtype: int64
# Calculate the top 5 customers with the most purchases
top5_customer = df['Serial Number'].value_counts().head()
# Create the bar chart
plt.figure(figsize=(10, 6))
bars = plt.bar(top5_customer.index, top5_customer.values, color='skyblue')
# Add data labels
for bar in bars:
yval = bar.get_height()
plt.text(bar.get_x() + bar.get_width()/2, yval, round(yval, 2), va='bottom', ha='center')
# Add labels and title
plt.xlabel('Customer Serial Number')
plt.ylabel('Number of Purchases')
plt.title('Top 5 Customers with the Most Purchases')
plt.xticks(rotation=45) # Rotate x-axis labels for better readability
# Show the plot
plt.tight_layout()
plt.show()
# The graph above shows the top 5 customers contributing to the most purchase and we can see similar purchase counts ranging from 171 to 172 purchases each. This consistency suggest that these customers have a regular buying pattern contributing significantly to sales over time.
# Top 10 Town with the most purchases
top10_countries = df['Town'].value_counts()[:10]
top10_countries
Town Bridgeport 34201 Stamford 32529 Waterbury 28506 Norwalk 23960 New Haven 21346 Danbury 20350 West Hartford 19854 Hartford 18810 Milford 17749 Meriden 17502 Name: count, dtype: int64
# Top 10 Towns with the most purchases
top10_towns = df['Town'].value_counts()[:10]
# Plot horizontal bar chart
plt.figure(figsize=(10, 6))
top10_towns.plot(kind='barh', color='skyblue')
plt.xlabel('Number of Purchases')
plt.ylabel('Town')
plt.title('Top 10 Towns with the Most Purchases')
plt.gca().invert_yaxis() # Invert y-axis to have the highest count at the top
plt.show()
# ### Properties with sales ratio significantly above and below 1
# Define threshold for significant deviation
threshold = 1.1 # Adjust as needed based on your analysis
# Identify properties with sales ratios significantly above or below 1
over_assessed = df[df['Sales Ratio'] > threshold]
under_assessed = df[df['Sales Ratio'] < 1 / threshold]
# Print summary statistics or additional information about identified properties
print("Properties with sales ratios significantly above 1 (potential over-assessment):")
print(over_assessed)
print("\nProperties with sales ratios significantly below 1 (potential under-assessment):")
print(under_assessed)
# ### Bivariate Analysis
# - You are considering two features or variables and its visualization to understand the patterns, trends, and the measure of relationship between them.
Properties with sales ratios significantly above 1 (potential over-assessment):
Serial Number List Year Date Recorded Town \
6 2020180 2020 2021-03-01 Berlin
32 200039 2020 2020-11-06 Coventry
50 11238 2001 2002-08-30 Bethel
69 200055 2020 2020-11-19 Essex
197 10110 2001 2002-03-27 Thompson
... ... ... ... ...
997166 19000067 2019 2020-05-19 New Hartford
997172 190060 2019 2019-12-10 Wilton
997190 190253 2019 2020-06-04 New London
997204 190105 2019 2020-02-25 Plymouth
997208 190272 2019 2020-06-24 New London
Address Assessed Value Sale Amount Sales Ratio \
6 1539 FARMINGTON AVE 234200.0 130000.0 1.801500
32 2075 MAIN ST 43700.0 25000.0 1.748000
50 50 FOURTH ST 76450.0 50000.0 1.529000
69 HILLSIDE DR 5600.0 5000.0 1.120000
197 THOMPSON RD 34300.0 5000.0 6.860000
... ... ... ... ...
997166 LOT 2 DINGS RD 87955.0 35000.0 2.513000
997172 72 WEST MEADOE RD 431060.0 385000.0 1.119600
997190 298 CRYSTAL AVE 95130.0 70000.0 1.359000
997204 18 OVERLOOK RD 104130.0 80000.0 1.301625
997208 4 BISHOP CT 60410.0 53100.0 1.137665
Property Type Residential Type Non Use Code \
6 Residential Two Family 08 - Part Interest
32 Vacant Land Single Family 25 - Other
50 Single Family Single Family 8
69 Vacant Land Single Family 12 - Non Buildable Lot
197 Single Family Single Family 6
... ... ... ...
997166 Single Family Single Family 28 - Use Assessment
997172 Single Family Single Family 14 - Foreclosure
997190 Single Family Single Family 10 - A Will
997204 Single Family Single Family 14 - Foreclosure
997208 Single Family Single Family 14 - Foreclosure
Assessor Remarks OPM remarks \
6 ESTATE SALE nan
32 ESTATE SALE nan
50 ESTATE SALE nan
69 EXCESS LAND nan
197 ESTATE SALE nan
... ... ...
997166 ESTATE SALE nan
997172 BANK OWNED AND PRICED FOR IMMEDIATE SALE PER S... nan
997190 ESTATE SALE nan
997204 ESTATE SALE BANK SALE PER MLS
997208 ESTATE SALE nan
Geographic Area
6 POINT (-72.36336 41.97461)
32 POINT (-72.36336 41.97461)
50 POINT (-72.36336 41.97461)
69 POINT (-72.36336 41.97461)
197 POINT (-72.36336 41.97461)
... ...
997166 POINT (-73.02432 41.83788)
997172 POINT (-72.36336 41.97461)
997190 POINT (-72.36336 41.97461)
997204 POINT (-72.98492 41.64753)
997208 POINT (-72.36336 41.97461)
[106884 rows x 14 columns]
Properties with sales ratios significantly below 1 (potential under-assessment):
Serial Number List Year Date Recorded Town \
0 2020348 2020 2021-09-13 Ansonia
1 20002 2020 2020-10-02 Ashford
2 200212 2020 2021-03-09 Avon
3 200243 2020 2021-04-13 Avon
4 200377 2020 2021-07-02 Avon
... ... ... ... ...
997207 190272 2019 2020-08-03 Wilton
997209 190284 2019 2019-11-27 Waterbury
997210 190129 2019 2020-04-27 Windsor Locks
997211 190504 2019 2020-06-03 Middletown
997212 190344 2019 2019-12-20 Milford
Address Assessed Value Sale Amount Sales Ratio \
0 230 WAKELEE AVE 150500.0 325000.0 0.4630
1 390 TURNPIKE RD 253000.0 430000.0 0.5883
2 5 CHESTNUT DRIVE 130400.0 179900.0 0.7248
3 111 NORTHINGTON DRIVE 619290.0 890000.0 0.6958
4 70 FAR HILLS DRIVE 862330.0 1447500.0 0.5957
... ... ... ... ...
997207 145 WHIPSTICK RD 681870.0 1134708.0 0.6009
997209 126 PERKINS AVE 68280.0 76000.0 0.8984
997210 19 HATHAWAY ST 121450.0 210000.0 0.5783
997211 8 BYSTREK DR 203360.0 280000.0 0.7263
997212 250 RESEARCH DR 4035970.0 7450000.0 0.5417
Property Type Residential Type Non Use Code Assessor Remarks \
0 Commercial Single Family 25 - Other ESTATE SALE
1 Residential Single Family 25 - Other ESTATE SALE
2 Residential Condo 25 - Other ESTATE SALE
3 Residential Single Family 25 - Other ESTATE SALE
4 Residential Single Family 25 - Other ESTATE SALE
... ... ... ... ...
997207 Single Family Single Family 25 - Other ESTATE SALE
997209 Single Family Single Family 25 - Other PRIVATE SALE
997210 Single Family Single Family 25 - Other ESTATE SALE
997211 Single Family Single Family 25 - Other ESTATE SALE
997212 Single Family Single Family 25 - Other ESTATE SALE
OPM remarks Geographic Area
0 nan POINT (-72.36336 41.97461)
1 nan POINT (-72.36336 41.97461)
2 nan POINT (-72.36336 41.97461)
3 nan POINT (-72.36336 41.97461)
4 nan POINT (-72.36336 41.97461)
... ... ...
997207 PER MLS CLOSING PRICE = $1,145,000 POINT (-72.36336 41.97461)
997209 nan POINT (-72.36336 41.97461)
997210 nan POINT (-72.36336 41.97461)
997211 nan POINT (-72.36336 41.97461)
997212 nan POINT (-72.36336 41.97461)
[832523 rows x 14 columns]
# Top 10 Town by Sales Ratio
top5_sales_town = df.groupby('Town')['Sales Ratio'].sum().sort_values(ascending = False)[:10]
top5_sales_town
Town Salisbury 1.228449e+06 Newtown 7.865511e+05 New Fairfield 6.159876e+05 Westport 6.024425e+05 East Hartford 5.350223e+05 Brookfield 5.244789e+05 Stamford 4.581180e+05 Bethany 4.217926e+05 Guilford 3.983694e+05 Beacon Falls 3.866651e+05 Name: Sales Ratio, dtype: float64
import plotly.express as px
import pandas as pd
# Create a DataFrame for the top 10 town by sales ratio
top5_sales_product = df.groupby('Town')['Sales Ratio'].sum().sort_values(ascending=False)[:10].reset_index()
# Create a bar chart with data labels
fig = px.bar(
top5_sales_product,
x='Sales Ratio',
y='Town',
text='Sales Ratio', # This adds data labels
labels={'Town': 'Town', 'Sales Ratio': 'Total Sales'},
title='Top 10 Town by Sales Ratio (Bar Chart)',
)
# Customize the appearance of the bar chart
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_xaxes(title_text='')
fig.update_yaxes(categoryorder='total ascending')
# Show the customized bar chart
fig.show()
# Sales trend
sales_trend = df.groupby('List Year')['Sale Amount'].sum()
sales_trend
#Assuming "List Year" is the column representing the year
sales_over_years = df.groupby('List Year')['Sale Amount'].sum().reset_index()
plt.figure(figsize=(10, 6))
plt.plot(sales_over_years['List Year'], sales_over_years['Sale Amount'], marker='o', linestyle='-', color='b')
plt.title('Real Estate Sales Over the Years')
plt.xlabel('Year')
plt.ylabel('Total Sales Amount')
plt.grid(True)
plt.xticks(rotation=45) # Rotate x-axis labels for readability
plt.show()
# Scatter plot of sales amount vs. assessed value
plt.figure(figsize=(8, 6))
plt.scatter(df['Assessed Value'], df['Sale Amount'], alpha=0.5)
plt.title('Sale Amount vs. Assessed Value')
plt.xlabel('Assessed Value')
plt.ylabel('Sale Amount')
plt.grid(True)
plt.show()
# Calculate Pearson correlation coefficient
correlation_coefficient = df['Assessed Value'].corr(df['Sale Amount'])
print(f"Pearson correlation coefficient: {correlation_coefficient}")
Pearson correlation coefficient: 0.11096153179901766
# Set style
sns.set(style="whitegrid")
# Create subplots for each property type
fig, axes = plt.subplots(3, 1, figsize=(12, 19))
# Plot violin plots for assessment values by property type
sns.violinplot(x='Property Type', y='Assessed Value', data=df, ax=axes[0])
axes[0].set_title('Distribution of Assessed Values by Property Type')
axes[0].set_ylabel('Assessed Value')
# Plot violin plots for sales amounts by property type
sns.violinplot(x='Property Type', y='Sale Amount', data=df, ax=axes[1])
axes[1].set_title('Distribution of Sales Amounts by Property Type')
axes[1].set_ylabel('Sale Amount')
# Plot violin plots for sales ratios by property type
sns.violinplot(x='Property Type', y='Sales Ratio', data=df, ax=axes[2])
axes[2].set_title('Distribution of Sales Ratios by Property Type')
axes[2].set_ylabel('Sales Ratio')
plt.show()
# Calculate average assessment value and sale amount for each non-use code
non_use_code_stats = df.groupby('Non Use Code')[['Assessed Value', 'Sale Amount']].mean()
# Sort by average assessment value and sale amount
top20_assessed_value = non_use_code_stats['Assessed Value'].nlargest(20)
top20_sale_amount = non_use_code_stats['Sale Amount'].nlargest(20)
# Plot top 20 distribution of assessment values by non-use code
plt.figure(figsize=(12, 20))
top20_assessed_value.plot(kind='bar', color='skyblue')
plt.title('Top 20 Distribution of Assessment Values by Non-Use Code')
plt.xlabel('Non-Use Code')
plt.ylabel('Average Assessment Value')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Plot top 20 distribution of sale amounts by non-use code
plt.figure(figsize=(12, 20))
top20_sale_amount.plot(kind='bar', color='lightgreen')
plt.title('Top 20 Distribution of Sale Amounts by Non-Use Code')
plt.xlabel('Non-Use Code')
plt.ylabel('Average Sale Amount')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Convert 'date recorded' column to datetime format
df['Date Recorded'] = pd.to_datetime(df['Date Recorded'])
# Extract year from 'date recorded' column
df['Year'] = df['Date Recorded'].dt.year
# Group by year and sum the sales amounts
total_sales_by_year = df.groupby('Year')['Sale Amount'].sum()
# Line plot
plt.figure(figsize=(10, 6))
total_sales_by_year.plot(kind='line', marker='o', color='skyblue')
plt.title('Total Sales Generated by Year')
plt.xlabel('Year')
plt.ylabel('Total Sales Amount')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
# Create a correlation matrix of the numerical columns
corr_matrix = df[['Assessed Value', 'Sale Amount', 'Sales Ratio']].corr()
# Create a customized correlation matrix using Plotly Express
fig = px.imshow(
corr_matrix,
x=['Assessed Value', 'Sale Amount', 'Sales Ratio'],
y=['Assessed Value', 'Sale Amount', 'Sales Ratio'],
title='Correlation Matrix of Numerical Columns',
)
# Customize the color scale and axis labels
fig.update_xaxes(title_text='Columns')
fig.update_yaxes(title_text='Columns')
fig.update_layout(coloraxis_showscale=False) # Hide the color scale
# Add correlation values to the matrix
corr_values = np.around(corr_matrix.values, 2) # Round the values to two decimal places
annotations = []
for i in range(len(corr_matrix.columns)):
for j in range(len(corr_matrix.columns)):
annotations.append(
dict(
x=corr_matrix.columns[i],
y=corr_matrix.columns[j],
text=str(corr_values[i, j]),
showarrow=False,
)
)
fig.update_layout(
annotations=annotations,
)
# Show the correlation matrix
fig.show()
df.to_csv('Updated_Real_Estate.csv', index=False) # Replace the file_name.xlsv with the desired file name
pip install python-pptx
Requirement already satisfied: python-pptx in c:\users\user1\anaconda3\lib\site-packages (0.6.23)Note: you may need to restart the kernel to use updated packages. Requirement already satisfied: lxml>=3.1.0 in c:\users\user1\anaconda3\lib\site-packages (from python-pptx) (4.9.3) Requirement already satisfied: Pillow>=3.3.2 in c:\users\user1\anaconda3\lib\site-packages (from python-pptx) (10.0.1) Requirement already satisfied: XlsxWriter>=0.5.7 in c:\users\user1\anaconda3\lib\site-packages (from python-pptx) (3.2.0)
from pptx import Presentation
# Generate your findings and recommendations
findings = """
Findings:
- The sales data shows a steady increase in revenue over the past year.
- Customer satisfaction scores have improved by 10% compared to the previous quarter.
"""
recommendations = """
Recommendations:
- Increase marketing efforts targeting new customer segments.
- Implement a loyalty program to retain existing customers.
"""
# Create a PowerPoint presentation
prs = Presentation()
# Add title slide
slide_layout = prs.slide_layouts[0] # Title slide layout
slide = prs.slides.add_slide(slide_layout)
title = slide.shapes.title
title.text = "Data Analysis Report"
# Add findings slide
slide_layout = prs.slide_layouts[1] # Title and Content layout
slide = prs.slides.add_slide(slide_layout)
title = slide.shapes.title
content = slide.placeholders[1]
title.text = "Findings"
content.text = findings
# Add recommendations slide
slide = prs.slides.add_slide(slide_layout)
title = slide.shapes.title
content = slide.placeholders[1]
title.text = "Recommendations"
content.text = recommendations
# Save the presentation
prs.save("Analysis_Report.pptx")